CC BY 4.0
This document is licensed under a Creative Commons Attribution 4.0 International license.
This license applies to both the compiled html document as well as the original R markdown code file.
library(tidyverse)
library(RColorBrewer)
library(tidyverse)
library(gridExtra)
library(MASS)
library(rms) # for splines
library(glmnet) # for ridge, lasso, elastic net
library(caret) # for various methods
library(mvtnorm) # for computing bivariate normal density
library(RSNNS)
set.seed(12345) # for reproducibility
x<-runif(100,min=0.25,max=3)
y<-10-2/x+rnorm(length(x),mean=0,sd=1)
regDat<-data.frame(x=x,y=y)
xReg<-seq(0.25,3,length=1000)
yReg<-10-2/xReg
regDatNoNoise<-data.frame(xReg,yReg)
for(j in 1:6){
regDat[,paste(sep="","v",j)]<-rnorm(length(x)) # for the penalisation methods
}
classDat<-data.frame(class=factor(c(rep("c1",50),rep("c2",100))),x=NA,y=NA)
classDat[classDat$class=="c1",c("x","y")]<-mvrnorm(sum(classDat$class=="c1"),mu=c(1,2),Sigma=diag(1,2))
classDat[classDat$class=="c2",c("x","y")]<-mvrnorm(sum(classDat$class=="c2"),mu=c(2,1),Sigma=matrix(byrow=T,nrow=2,c(0.5,-0.2,-0.2,0.5)))
newRegDat<-data.frame(x=seq(0.25,3,length=1000))
newClassDat<-expand.grid(x=seq(-1,4.5,length=240),y=seq(-1,4.5,length=240))
save(regDat,classDat,newRegDat,newClassDat,file=paste(sep="","regClassDat_",Sys.Date(),".RData"))
# simple data plots
p0a<-ggplot() +
geom_point(data=regDat,mapping=aes(x=x,y=y),size=3) +
theme_light() +
theme(text=element_text(size=12)) +
ggtitle("Regression data.")
p0aAnnot<-p0a +
geom_line(data=regDatNoNoise,mapping=aes(x=xReg,y=yReg),col="steelblue") +
labs(caption="True relationship is y = 10 - 2/x.")
p0b<-ggplot() +
geom_point(data=classDat,mapping=aes(col=class,x=x,y=y,pch=class),size=3) +
xlim(-1,4.25) + ylim(-1,4.25) +
labs(title="Classification / clustering data.") +
theme_light() +
theme(text=element_text(size=12))
grid.arrange(p0a,p0b+coord_fixed(ratio = 1),ncol=2)
densC1<-dmvnorm(x=newClassDat,mean=c(1,2),sigma=diag(1,2))
densC2<-dmvnorm(x=newClassDat,mean=c(2,1),sigma=matrix(byrow=T,nrow=2,c(0.5,-0.2,-0.2,0.5)))
priorC1<-50/150
priorC2<-100/150
postC1<-priorC1*densC1/(priorC1*densC1+priorC2*densC2)
newClassDat$bayesClassif<-factor(ifelse(postC1>0.5,"c1","c2"))
newClassDat$bayesClassifNum<-postC1
newClassDat$trueC1dens<-densC1
newClassDat$trueC2dens<-densC2
p0bwithBayesClassif<-p0b +
geom_point(mapping=aes(x=x,y=y,col=bayesClassif),data=newClassDat,size=0.2,alpha=0.25) +
geom_contour(mapping=aes(x=x,y=y,z=bayesClassifNum),data=newClassDat,breaks=0.5,color="black",lwd=0.8) +
geom_contour(mapping=aes(x=x,y=y,z=trueC1dens),data=newClassDat,color="red",lwd=0.5) +
geom_contour(mapping=aes(x=x,y=y,z=trueC2dens),data=newClassDat,color="blue",lwd=0.5) +
ggtitle("Bayes classifier (with true generating distributions)\nclassification boundary")
grid.arrange(p0aAnnot,p0bwithBayesClassif+coord_fixed(ratio = 1),ncol=2)
pdf("keyTechs_0_data.pdf",width=30,height=10)
grid.arrange(p0a+theme(text=element_text(size=20)),p0b+theme(text=element_text(size=20)),ncol=2)
dev.off()
## quartz_off_screen
## 2
pdf("keyTechs_0_dataAnnotated.pdf",width=30,height=10)
grid.arrange(p0aAnnot+theme(text=element_text(size=20)),p0bwithBayesClassif+theme(text=element_text(size=20)),ncol=2)
dev.off()
## quartz_off_screen
## 2
m1<-lm(y~x,data=regDat)
pred1<-data.frame(predict(m1,newdata=newRegDat,interval="confidence"))
newRegDat$lwr<-pred1$lwr
newRegDat$upr<-pred1$upr
newRegDat$fit<-pred1$fit
polyDat<-data.frame(x=c(newRegDat$x,newRegDat$x[nrow(newRegDat):1]),y=c(newRegDat$lwr,newRegDat$upr[nrow(newRegDat):1]))
p1reg<-p0a +
geom_polygon(data=polyDat,mapping=aes(x=x,y=y),color=NA,fill="darkgrey",alpha=0.5) +
geom_line(data=newRegDat,mapping=aes(x=x,y=fit), color="blue", linetype="solid") +
ggtitle("GLM")
print(p1reg)
m1class<-glm(family="binomial",class~x+y,data=classDat)
newClassDat$pred1class<-predict(m1class,newdata=newClassDat,type="response")
newClassDat$pred1classBin<-factor(ifelse(newClassDat$pred1class>0.5,"c2","c1"))
p1class<- p0b +
geom_point(mapping=aes(x=x,y=y,col=pred1classBin),data=newClassDat,size=0.2,alpha=0.25) +
geom_contour(mapping=aes(x=x,y=y,z=pred1class),data=newClassDat,breaks=0.5,color="black",lwd=0.8) +
ggtitle("GLM - logistic regression classification boundary")
print(p1class+coord_fixed(ratio = 1))
pdf("keyTechs_1_glm.pdf",width=30,height=10)
grid.arrange(p1reg+theme(text=element_text(size=20)),p1class+theme(text=element_text(size=20)),ncol=2)
dev.off()
## quartz_off_screen
## 2
m2a<-lm(y~lsp(x,1),data=regDat)
pred2a<-data.frame(predict(m2a,newdata=newRegDat,interval="confidence"))
newRegDat$lwr<-pred2a$lwr
newRegDat$upr<-pred2a$upr
newRegDat$fit<-pred2a$fit
polyDat<-data.frame(x=c(newRegDat$x,newRegDat$x[nrow(newRegDat):1]),y=c(newRegDat$lwr,newRegDat$upr[nrow(newRegDat):1]))
p2a<-p0a +
geom_polygon(data=polyDat,mapping=aes(x=x,y=y),color=NA,fill="darkgrey",alpha=0.5) +
geom_line(data=newRegDat,mapping=aes(x=x,y=fit), color="blue", linetype="solid") +
ggtitle("Linear spline (1 knot)")
m2b<-lm(y~rcs(x,3),data=regDat)
pred2b<-data.frame(predict(m2b,newdata=newRegDat,interval="confidence"))
newRegDat$lwr<-pred2b$lwr
newRegDat$upr<-pred2b$upr
newRegDat$fit<-pred2b$fit
polyDat<-data.frame(x=c(newRegDat$x,newRegDat$x[nrow(newRegDat):1]),y=c(newRegDat$lwr,newRegDat$upr[nrow(newRegDat):1]))
p2b<-p0a +
geom_polygon(data=polyDat,mapping=aes(x=x,y=y),color=NA,fill="darkgrey",alpha=0.5) +
geom_line(data=newRegDat,mapping=aes(x=x,y=fit), color="blue", linetype="solid") +
ggtitle("Restricted cubic spline (3 knots)")
m2class<-glm(family="binomial",class~rcs(x,3)+rcs(y,3),data=classDat)
newClassDat$pred2class<-predict(m2class,newdata=newClassDat,type="response")
newClassDat$pred2classBin<-factor(ifelse(newClassDat$pred2class>0.5,"c2","c1"))
p2class<- p0b +
geom_point(mapping=aes(x=x,y=y,col=pred2classBin),data=newClassDat,size=0.2,alpha=0.25) +
geom_contour(mapping=aes(x=x,y=y,z=pred2class),data=newClassDat,breaks=0.5,color="black",lwd=0.8) +
ggtitle("GLM - logistic regression with RCS classification boundary")
grid.arrange(p2a,p2b,ncol=2)
print(p2class)
pdf("keyTechs_2_GAM_reg.pdf",width=30,height=10)
grid.arrange(p2a+theme(text=element_text(size=20)),p2b+theme(text=element_text(size=20)),ncol=2)
dev.off()
## quartz_off_screen
## 2
pdf("keyTechs_2_GAM_class.pdf",width=15,height=10)
print(p2class+theme(text=element_text(size=20)))
dev.off()
## quartz_off_screen
## 2
m3_elnet<-glmnet(y=regDat$y,x=as.matrix(regDat[,-2]),alpha=0.5)
m3_ridge<-glmnet(y=regDat$y,x=as.matrix(regDat[,-2]),alpha=0)
m3_lasso<-glmnet(y=regDat$y,x=as.matrix(regDat[,-2]),alpha=1)
par(mfrow=c(1,2),mar=c(5,4,6,1))
plot(m3_ridge,main="ridge regression",label=T,xvar="lambda",lwd=2,cex=1.5)
plot(m3_lasso,main="lasso regression",label=T,xvar="norm",lwd=2,cex=1.5)
pdf("keyTechs_3_regularisation.pdf",width=30,height=10)
par(mfrow=c(1,2),mar=c(5,4,6,1))
plot(m3_ridge,main="ridge regression",label=T,xvar="lambda",lwd=2,cex=2.5)
plot(m3_lasso,main="lasso regression",label=T,xvar="norm",lwd=2,cex=2.5)
dev.off()
## quartz_off_screen
## 2
m4knn<-caret::train(class ~ x + y, data = classDat,
method = "knn",
preProcess = c("center", "scale"), # (centering &) scaling recommended as otherwise variables on large numerical scales will dominate the distance metric
tuneGrid=data.frame(k=15)) # number of nearest neighbours; could also have R select this for us via CV but for illustration purposes we fix this here (otherwise we would first specify parameters for the CV...)
newClassDat$pred4knn<-predict(m4knn,newdata=newClassDat)
newClassDat$pred4knnNum<-as.integer(newClassDat$pred4knn)-1
p4knn<-p0b +
geom_point(mapping=aes(x=x,y=y,col=pred4knn),data=newClassDat,size=0.2,alpha=0.25) +
geom_contour(mapping=aes(x=x,y=y,z=pred4knnNum),data=newClassDat,breaks=0.5,color="black",lwd=0.8) +
ggtitle("nearest neighbour (k=15) classification boundary")
print(p4knn+coord_fixed(ratio = 1))
m4knnReg<-caret::train(y ~ x, data = regDat,
method = "knn",
preProcess = c("center", "scale"), # (centering &) scaling recommended as otherwise variables on large numerical scales will dominate the distance metric
tuneGrid=data.frame(k=15)) # number of nearest neighbours; could also have R select this for us via CV but for illustration purposes we fix this here (otherwise we would first specify parameters for the CV...)
newRegDat$pred4knnReg<-predict(m4knnReg,newdata=newRegDat)
p4knnReg<-p0a +
geom_line(data=newRegDat,mapping=aes(x=x,y=pred4knnReg), color="blue", linetype="solid") +
ggtitle("kNN regression (k=15)")
print(p4knnReg)
pdf("keyTechs_4_kNN.pdf",width=30,height=10)
grid.arrange(p4knnReg+theme(text=element_text(size=20)),p4knn+theme(text=element_text(size=20)),ncol=2)
dev.off()
## quartz_off_screen
## 2
m4kmeans<-kmeans(classDat[,c("x","y")], centers=2, nstart=20) # in practice, just as for kNN, you would probably want to do first center & scale....
classDat$kmeans<-factor(m4kmeans$cluster)
classC1<-as.character(names(sort(table(classDat$class[classDat$kmeans==1]),decreasing=TRUE))[1]) # this work here, but could backfire if one class is overwhelmingly more common than the other and both clusters aredominated by that class...
classC2<-as.character(names(sort(table(classDat$class[classDat$kmeans==2]),decreasing=TRUE))[1]) # this work here, but could backfire if one class is overwhelmingly more common than the other and both clusters aredominated by that class...
nearestCentroid<-function(x,kmeans) {
centDist<-apply(kmeans$centers, MARGIN=1, FUN=function(y){sqrt(sum((x-y)^2))})
return(which.min(centDist)[1])
}
newClassDat$pred4kmeans<-factor(ifelse(apply(newClassDat[,c("x","y")],MARGIN=1,FUN=nearestCentroid,kmeans=m4kmeans)==1,classC1,classC2))
newClassDat$pred4kmeansNum<-as.integer(newClassDat$pred4kmeans)-1
p4kmeansClust<-ggplot() +
geom_point(data=classDat,mapping=aes(x=x,y=y,pch=kmeans),col="black",alpha=0.5,cex=5) +
geom_point(data=classDat,mapping=aes(x=x,y=y,col=class),cex=2) +
xlim(-1,4.25) + ylim(-1,4.25) +
ggtitle("k-means clustering") +
theme_light()
print(p4kmeansClust+coord_fixed(ratio = 1))
p4kmeansClass<-p0b +
geom_point(mapping=aes(x=x,y=y,col=pred4kmeans),data=newClassDat,size=0.2,alpha=0.25) +
geom_contour(mapping=aes(x=x,y=y,z=pred4kmeansNum),data=newClassDat,breaks=0.5,color="black",lwd=0.8) +
ggtitle("k-means classification boundary")
print(p4kmeansClass+coord_fixed(ratio = 1))
pdf("keyTechs_4_kmeans.pdf",width=15,height=10)
print(p4kmeansClass+theme(text=element_text(size=20)))
dev.off()
## quartz_off_screen
## 2
dens1<-kde2d(x=classDat$x[classDat$class=="c1"],y=classDat$y[classDat$class=="c1"],n=300,lims=c(c(-1,4.25),c(-1,4.25)))
dens2<-kde2d(x=classDat$x[classDat$class=="c2"],y=classDat$y[classDat$class=="c2"],n=300,lims=c(c(-1,4.25),c(-1,4.25)))
prior1<-sum(classDat$class=="c1")/nrow(classDat)
prior2<-sum(classDat$class=="c2")/nrow(classDat)
class1Post<-prior1*dens1$z/(prior1*dens1$z+prior2*dens2$z) # only can do this because we evaluated dens1, dens2 on the same grid!
class2Post<-prior2*dens2$z/(prior1*dens1$z+prior2*dens2$z) # not needed; only added for the sake of completeness
class1PostVect<-as.vector(class1Post)
tmpGr<-expand.grid(dens1$x,dens1$y)
tmpClassDat<-data.frame(x=tmpGr[,1],y=tmpGr[,2],pred5KernDensNum=class1PostVect,pred5KernDens=factor(ifelse(class1PostVect>0.5,"c1","c2")),class1Dens=as.vector(dens1$z),class2Dens=as.vector(dens2$z))
p5kerndensClass<-p0b +
geom_point(mapping=aes(x=x,y=y,col=pred5KernDens),data=tmpClassDat,size=0.2,alpha=0.25) +
geom_contour(mapping=aes(x=x,y=y,z=pred5KernDensNum),data=tmpClassDat,breaks=0.5,color="black",lwd=0.8) +
geom_contour(mapping=aes(x=x,y=y,z=class1Dens),data=tmpClassDat,color="red",lwd=0.5) +
geom_contour(mapping=aes(x=x,y=y,z=class2Dens),data=tmpClassDat,color="blue",lwd=0.5) +
ggtitle("Kernel density estimation - classification boundary")
print(p5kerndensClass+coord_fixed(ratio = 1))
pdf("keyTechs_5_kde.pdf",width=15,height=10)
print(p5kerndensClass+theme(text=element_text(size=20)))
dev.off()
## quartz_off_screen
## 2
No R code for this section.
m7lda<-caret::train(class ~ x + y, data = classDat, method = "lda")
newClassDat$pred7lda<-predict(m7lda,newdata=newClassDat)
newClassDat$pred7ldaNum<-as.integer(newClassDat$pred7lda)-1
m7qda<-caret::train(class ~ x + y, data = classDat, method = "qda")
newClassDat$pred7qda<-predict(m7qda,newdata=newClassDat)
newClassDat$pred7qdaNum<-as.integer(newClassDat$pred7qda)-1
p7lda<-p0b +
geom_point(mapping=aes(x=x,y=y,col=pred7lda),data=newClassDat,size=0.2,alpha=0.25) +
geom_contour(mapping=aes(x=x,y=y,z=pred7ldaNum),data=newClassDat,breaks=0.5,color="black",lwd=0.8) +
ggtitle("LDA classification boundary")
p7qda<-p0b +
geom_point(mapping=aes(x=x,y=y,col=pred7qda),data=newClassDat,size=0.2,alpha=0.25) +
geom_contour(mapping=aes(x=x,y=y,z=pred7qdaNum),data=newClassDat,breaks=0.5,color="black",lwd=0.8) +
ggtitle("QDA classification boundary")
grid.arrange(p7lda+coord_fixed(ratio = 1),p7qda+coord_fixed(ratio = 1),ncol=2)
pdf("keyTechs_7_LDAQDA.pdf",width=30,height=10)
grid.arrange(p7lda+theme(text=element_text(size=20)),p7qda+theme(text=element_text(size=20)),ncol=2)
dev.off()
## quartz_off_screen
## 2
m8cartClass<-caret::train(class ~ x + y, data = classDat, method = "rpart", tuneGrid=data.frame(cp=0)) # as this is a simple example, the graph will be more interesting for a fully grown tree, hence set cost-complexiy parameter to 0
newClassDat$pred8cartClass<-predict(m8cartClass,newdata=newClassDat)
newClassDat$pred8cartClassNum<-as.integer(newClassDat$pred8cartClass)-1
p8cartClass<-p0b +
geom_point(mapping=aes(x=x,y=y,col=pred8cartClass),data=newClassDat,size=0.2,alpha=0.25) +
geom_contour(mapping=aes(x=x,y=y,z=pred8cartClassNum),data=newClassDat,breaks=0.5,color="black",lwd=0.8) +
ggtitle("CART decision tree (fully grown) classification boundary")
m8cartReg<-caret::train(y ~ x, data = regDat, method = "rpart")
newRegDat$pred8cartReg<-predict(m8cartReg,newdata=newRegDat)
p8cartReg<-p0a +
geom_line(data=newRegDat,mapping=aes(x=x,y=pred8cartReg), color="blue", linetype="solid") +
ggtitle("CART decision tree regression")
grid.arrange(p8cartClass+coord_fixed(ratio = 1),p8cartReg,ncol=2)
pdf("keyTechs_8_CART.pdf",width=30,height=10)
grid.arrange(p8cartReg+theme(text=element_text(size=20)),p8cartClass+theme(text=element_text(size=20)),ncol=2)
dev.off()
## quartz_off_screen
## 2
m8rfClass<-caret::train(class ~ x + y, data = classDat, method = "rf", tuneGrid=data.frame(mtry=c(1,2)), trainControl=trainControl(method = "repeatedcv", number = 10, repeats = 3))
newClassDat$pred8rfClass<-predict(m8rfClass,newdata=newClassDat,type="raw")
newClassDat$pred8rfClassNum<-predict(m8rfClass,newdata=newClassDat,type="prob")[,1]
p8rfClass<-p0b +
geom_point(mapping=aes(x=x,y=y,col=pred8rfClass),data=newClassDat,size=0.2,alpha=0.25) +
geom_contour(mapping=aes(x=x,y=y,z=pred8rfClassNum),data=newClassDat,breaks=0.5,color="black",lwd=0.8) +
ggtitle("Random forest classification boundary")
m8rfReg<-caret::train(y ~ x, data = regDat, method = "rf", tuneGrid=data.frame(mtry=1), trainControl=trainControl(method = "repeatedcv", number = 10, repeats = 3))
newRegDat$pred8rfReg<-predict(m8rfReg,newdata=newRegDat)
p8rfReg<-p0a +
geom_line(data=newRegDat,mapping=aes(x=x,y=pred8rfReg), color="blue", linetype="solid") +
ggtitle("Random forest regression")
grid.arrange(p8rfClass+coord_fixed(ratio = 1),p8rfReg,ncol=2)
pdf("keyTechs_8_rf.pdf",width=30,height=10)
grid.arrange(p8rfReg+theme(text=element_text(size=20)),p8rfClass+theme(text=element_text(size=20)),ncol=2)
dev.off()
## quartz_off_screen
## 2
# sticking to basic MLPs here
m9mlpClass<-caret::train(class ~ x + y, data = classDat, method = "mlp") # many other methods, e.g. nnet
newClassDat$pred9mlpClass<-predict(m9mlpClass,newdata=newClassDat,type="raw")
newClassDat$pred9mlpClassNum<-predict(m9mlpClass,newdata=newClassDat,type="prob")[,1]
p9mlpClass<-p0b +
geom_point(mapping=aes(x=x,y=y,col=pred9mlpClass),data=newClassDat,size=0.2,alpha=0.25) +
geom_contour(mapping=aes(x=x,y=y,z=pred9mlpClassNum),data=newClassDat,breaks=0.5,color="black",lwd=0.8) +
ggtitle("Neural network (MLP, 1 hidden layer) classification boundary")
m9mlpReg<-caret::train(y ~ x, data = regDat, method = "mlp")
newRegDat$pred9mlpReg<-predict(m9mlpReg,newdata=newRegDat)
p9mlpReg<-p0a +
geom_line(data=newRegDat,mapping=aes(x=x,y=pred9mlpReg), color="blue", linetype="solid") +
ggtitle("Neural network (MLP, 1 hidden layer) regression")
grid.arrange(p9mlpClass+coord_fixed(ratio = 1),p9mlpReg,ncol=2)
pdf("keyTechs_9_neuralnets.pdf",width=30,height=10)
grid.arrange(p9mlpReg+theme(text=element_text(size=20)),p9mlpClass+theme(text=element_text(size=20)),ncol=2)
dev.off()
## quartz_off_screen
## 2
No R code for this section.
tmpDat<-classDat[,1:3]
tmpDat[,2]<-scale(classDat[,2],center=T,scale=T)
tmpDat[,3]<-scale(classDat[,3],center=T,scale=T)
pcaObj<-prcomp(tmpDat[,2:3],retx=T)
pcaDat<-data.frame(class=classDat$class,pcaObj$x)
p11pca<-ggplot(data=pcaDat,mapping=aes(x=PC1,y=PC2,col=class,pch=class)) +
geom_point(size=3) +
xlim(-3,3) + ylim(-3,3) +
ggtitle("PCA - rotated data") +
theme_light()
grid.arrange(p0b+coord_fixed(ratio = 1),p11pca+coord_fixed(ratio = 1),ncol=2)
pdf("keyTechs_11_PCA.pdf",width=30,height=10)
grid.arrange(p0b+theme(text=element_text(size=20)),p11pca+theme(text=element_text(size=20)),ncol=2)
dev.off()
## quartz_off_screen
## 2
No R code for this section.